Filter criteria:
Filter differentially expressed genes between autism and control (adj. p-value < 0.05 and lfc<log2(1.2))
No samples are removed based on network connectivity z-scores
Preprocessing:
VST Normalisation
DESeq2 DEA
# # Load csvs
# datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
# datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
#
# # Make sure datExpr and datMeta columns/rows match
# rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
# if(!all(colnames(datExpr) == rownames(datMeta))){
# print('Columns in datExpr don\'t match the rowd in datMeta!')
# }
#
# # Annotate probes
# getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
# 'end_position','strand','band','gene_biotype','percentage_gc_content')
# mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
# dataset='hsapiens_gene_ensembl',
# host='feb2014.archive.ensembl.org') ## Gencode v19
# datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
# datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
# datProbes$length = datProbes$end_position-datProbes$start_position
#
# # Group brain regions by lobes
# datMeta$Brain_Region = as.factor(datMeta$Region)
# datMeta$Brain_lobe = 'Occipital'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
# datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#
# #################################################################################
# # FILTERS:
#
# # 1 Filter probes with start or end position missing (filter 5)
# # These can be filtered without probe info, they have weird IDS that don't start with ENS
# to_keep = !is.na(datProbes$length)
# datProbes = datProbes[to_keep,]
# datExpr = datExpr[to_keep,]
# rownames(datProbes) = datProbes$ensembl_gene_id
#
# # 2. Filter samples from ID AN03345 (filter 2)
# to_keep = (datMeta$Subject_ID != 'AN03345')
# datMeta = datMeta[to_keep,]
# datExpr = datExpr[,to_keep]
#
# # 3. Filter samples with rowSums <= 40
# to_keep = rowSums(datExpr)>40
# datExpr = datExpr[to_keep,]
# datProbes = datProbes[to_keep,]
#
# if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2.csv')){
# counts = as.matrix(datExpr)
# rowRanges = GRanges(datProbes$chromosome_name,
# IRanges(datProbes$start_position, width=datProbes$length),
# strand=datProbes$strand,
# feature_id=datProbes$ensembl_gene_id)
#
# se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
# ddsSE = DESeqDataSet(se, design =~Diagnosis_)
#
# dds = DESeq(ddsSE)
# DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
# mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>%
# dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
#
# write.csv(DE_info_DESeq2, './../working_data/genes_ASD_DE_info_DESeq2.csv', row.names = FALSE)
#
# rm(counts, rowRanges, se, ddsSE, dds, mart)
#
# } else DE_info = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv')
#
# save(file='./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata', datExpr, datMeta, datProbes, DE_info)
load('./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata')
# Scale gene expression values
# datExpr = datExpr %>% t %>% scale %>% t %>% data.frame
#################################################################################
# SFARI Genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
datExpr_backup = datExpr
# Filter DE genes
datExpr = datExpr[DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2),]
rm(mart, gene_names, GO_annotations)
We have less genes than before (~half)
glue('Number of genes: ', nrow(datExpr), '\n',
'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 1726
## Number of samples: 86 (51 ASD, 35 controls)
First principal component explains 97% of the total variance
reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
datExpr = log2(datExpr+1)
datExpr_pca = prcomp(datExpr, scale.=TRUE)
last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>%
filter(.[[2]] >= var_explained) %>% top_n(-1) %>% dplyr::select(ID)
par(mfrow=c(1,2))
plot(summary(datExpr_pca)$importance[2,], type='b')
abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
plot(summary(datExpr_pca)$importance[3,], type='b')
abline(h=var_explained, col='blue')
print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
var_explained*100, '% of the variance'))
datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}
reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)
## Keeping top 4 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output
rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output)
It’s more difficult to visualise two clouds of samples now
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point(alpha=0.2) + theme_minimal()
Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.
datMeta_backup = datMeta # So datMeta doesn't get replaced with unfiltered version
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
datMeta = datMeta_backup
ASD_pvals = DE_info$adj.P.Val
log_fold_change = DE_info$logFC
pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame %>% filter(rownames(.) %in% DE_info$ID)
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) +
theme_minimal() + theme(legend.position='bottom')
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() +
theme_minimal() + theme(legend.position='bottom') + scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)
rm(ASD_pvals, log_fold_change, top_genes, datExpr, datProbes, datMeta_backup, p1, p2)
Only 159 of our 3071 genes appear on the SFARI list, losing all 23/25 genes with a score of 1
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv',
col_types = cols(`number-of-reports` = col_integer(), syndromic = col_logical()))
Gene Score count considering all genes:
SFARI_genes$`gene-score` %>% table
## .
## 1 2 3 4 5 6
## 25 62 195 454 175 25
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>%
dplyr::select('ensembl_gene_id', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
gene_scores = SFARI_genes %>% dplyr::select(ensembl_gene_id, `gene-score`, syndromic) %>%
right_join(gene_names, by='ensembl_gene_id') %>% dplyr::select(-'gene-symbol')
# Full (ordered) list of datExpr genes with their scores
gene_scores = data.frame('ensembl_gene_id'=rownames(datExpr_redDim)) %>%
left_join(gene_scores, by = 'ensembl_gene_id') %>%
mutate('syndromic' = ifelse(syndromic==FALSE | is.na(syndromic), FALSE, TRUE),
'gene-bool' = ifelse(is.na(`gene-score`), FALSE, TRUE))
rm(SFARI_genes, mart, gene_names)
Gene score count for genes in datExpr
# gene_scores = gene_scores %>% mutate(`gene-score` = ifelse(is.na(`gene-score`) &
# ensembl_gene_id %in% GO_neuronal$ID, 'Neuronal', `gene-score`))
gene_scores$`gene-score` %>% table
## .
## 1 2 3 4 5 6
## 2 5 20 53 30 2
clusterings = list()
clusterings[['SFARI_score']] = gene_scores$`gene-score`
names(clusterings[['SFARI_score']]) = gene_scores$ensembl_gene_id
clusterings[['SFARI_bool']] = gene_scores$`gene-bool`
names(clusterings[['SFARI_bool']]) = gene_scores$ensembl_gene_id
clusterings[['syndromic']] = gene_scores$syndromic
names(clusterings[['syndromic']]) = gene_scores$ensembl_gene_id
set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')
datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster
Chose k=5 as best number of clusters. SFARI genes seem to group in the last two clusters
h_clusts = datExpr_redDim %>% dist %>% hclust
# plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 7
clusterings[['hc']] = cutree(h_clusts, best_k)
create_viridis_dict = function(){
min_score = clusterings[['SFARI_score']] %>% min(na.rm=TRUE)
max_score = clusterings[['SFARI_score']] %>% max(na.rm=TRUE)
viridis_score_cols = viridis(max_score - min_score + 1)
names(viridis_score_cols) = seq(min_score, max_score)
return(viridis_score_cols)
}
viridis_score_cols = create_viridis_dict()
dend_meta = gene_scores[match(labels(h_clusts), gene_scores$ensembl_gene_id),] %>%
mutate('SFARI_score' = viridis_score_cols[`gene-score`], # Purple: 2, Yellow: 6
'SFARI_bool' = ifelse(`gene-bool` == T, '#21908CFF', 'white'), # Acqua
'Syndromic' = ifelse(syndromic == T, 'orange', 'white'),
'Neuronal' = ifelse(ensembl_gene_id %in% GO_neuronal$ID, 'gray','white')) %>%
dplyr::select(SFARI_score, SFARI_bool, Syndromic, Neuronal)
h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datMeta))) %>%
set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)
Samples are grouped into two big clusters, two small clusters and two outliers, the first big cluster has one main subcluster, two small subclusters and three outliers, and the second one has one main subcluster, one small one and three groups of outliers.
*Output plots in clustering_genes_04_12 folder
Following this paper’s guidelines:
Run PCA and keep enough components to explain 60% of the variance
Run ICA with that same number of nbComp as principal components kept to then filter them
Select components with kurtosis > 3
Assign genes to clusters with FDR<0.01 using the fdrtool package
ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]
clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c
clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])
Leaves ALL observations without cluster!
ICA_clusters %>% rowSums %>% table
## .
## 0
## 1726
# ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) +
# geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') +
# theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()
The soft R-squared value starts in 0.5 but then becomes tiny and needs a power of over 200 to recover. Taking power=1
best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 500, by=50))
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.330000 1.46000 0.699 1540 1620 1630
## 2 51 0.024300 0.11500 0.955 686 833 1050
## 3 101 0.000311 -0.00951 0.950 491 579 853
## 4 151 0.029600 -0.08040 0.938 389 439 734
## 5 201 0.092500 -0.13200 0.948 323 351 649
## 6 251 0.177000 -0.18000 0.914 277 290 585
## 7 301 0.273000 -0.21000 0.940 243 246 534
## 8 351 0.383000 -0.24500 0.949 217 212 493
## 9 401 0.444000 -0.27200 0.910 196 186 458
## 10 451 0.532000 -0.29700 0.954 179 165 429
network = datExpr_redDim %>% t %>% blockwiseModules(power=1, numericLabels=TRUE)
## mergeCloseModules: less than two proper modules.
## ..color levels are 0, 1
## ..there is nothing to merge.
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)
It only leaves 39 genes without a cluster but classifies the rest as a single cluster:
clusterings[['WGCNA']] %>% table
## .
## 0 1
## 39 1687
The lowest BIC is achieved at 14 GM
n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=40, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')
best_k = 14
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)
Plot of clusters with their centroids in gray
gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())
# Clean up the environment a bit
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output,
ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network,
best_power, c, manual_clusters, manual_clusters_data, c1_sd, c2_sd, c1_mean, c2_mean,
gmm_c1_sd, gmm_c2_sd,gmm_c1_sd, gmm_c1_mean, gmm_c2_mean, p1, p2, pca_data_projection, dend_meta,
plot_gaussians, plot_points, n_clusters, viridis_score_cols, gg_colour_hue, create_viridis_dict)
Using Adjusted Rand Index:
Clusterings seem to give very different results and none resemble the SFARI scores
K-means and Hierarchical clustering and GMM are the only ones that are similar between them
cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
cluster1 = as.factor(clusterings[[i]])
for(j in (i):length(clusterings)){
cluster2 = as.factor(clusterings[[j]])
cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
}
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)
cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none',
cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE,
cexRow = 1, cexCol = 1, margins = c(7,7))
rm(i, j, cluster1, cluster2, cluster_sim)
All clusterings consider only the 1st component
WGCNA doesn’t work well (classifies almost everything as a single class)
SFARI genes seem to be everywhere (perhaps a bit more concentrated in low PC1 values)
1st PC seems to reflect the average level of expression of the genes
plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
mutate(ID = rownames(.), k_means = as.factor(clusterings[['km']]),
hc = as.factor(clusterings[['hc']]), cc_l1 = as.factor(clusterings[['cc_l1']]),
#cc_l2 = as.factor(clusterings[['cc_l2']]), ica = as.factor(clusterings[['ICA_min']]),
n_ica = as.factor(rowSums(ICA_clusters)), gmm = as.factor(clusterings[['GMM']]),
#gmm_2 = as.factor(clusterings[['GMM_2']]),
wgcna = as.factor(clusterings[['WGCNA']]),
#manual = as.factor(clusterings[['Manual']]), manual_mean = as.factor(clusterings[['Manual_mean']]),
#manual_sd = as.factor(clusterings[['Manual_sd']]),
SFARI = as.factor(clusterings[['SFARI_score']]),
SFARI_bool = as.factor(clusterings[['SFARI_bool']]), syndromic = as.factor(clusterings[['syndromic']])) %>%
bind_cols(DE_info[DE_info$ID %in% rownames(datExpr_redDim),]) %>%
mutate(avg_expr = log2(rowMeans(datExpr_backup)+1)[rownames(datExpr_backup) %in% rownames(datExpr_redDim)])
rownames(plot_points) = plot_points$ID
selectable_scatter_plot(plot_points, plot_points)